# real targets
-data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
+data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.jl
$(MAKE) -C data utf8proc_data.c.new
utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
# Unicode data generation rules. Except for the test data files, most
# users will not use these Makefile rules, which are primarily to re-generate
# unicode_data.c when we get a new Unicode version or charwidth data; they
-# require ruby and julia to be installed.
+# require julia to be installed.
# programs
CURL=curl
-RUBY=ruby
PERL=perl
MAKE=make
JULIA=julia
.DELETE_ON_ERROR:
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
- $(RUBY) data_generator.rb < UnicodeData.txt > $@
+RAWDATA = UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt EastAsianWidth.txt emoji-data.txt
-CharWidths.txt: charwidths.jl EastAsianWidth.txt
- $(JULIA) charwidths.jl > $@
+utf8proc_data.c.new: data_generator.jl $(RAWDATA)
+ $(JULIA) --project=. -e 'using Pkg; Pkg.instantiate()'
+ $(JULIA) --project=. data_generator.jl > $@
# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=15.1.0
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
Uppercase.txt: DerivedCoreProperties.txt
- $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@
+ $(JULIA) -e 'print(match(r"# Derived Property: Uppercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@
Lowercase.txt: DerivedCoreProperties.txt
- $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@
+ $(JULIA) -e 'print(match(r"# Derived Property: Lowercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@
clean:
- rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
+ rm -f $(RAWDATA) NormalizationTest.txt GraphemeBreakTest.txt
rm -f Uppercase.txt Lowercase.txt
rm -f utf8proc_data.c.new
--- /dev/null
+# This file is machine-generated - editing it directly is not advised
+
+julia_version = "1.9.3"
+manifest_format = "2.0"
+project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"
+
+[[deps.Adapt]]
+deps = ["LinearAlgebra", "Requires"]
+git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
+uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+version = "3.7.2"
+
+ [deps.Adapt.extensions]
+ AdaptStaticArraysExt = "StaticArrays"
+
+ [deps.Adapt.weakdeps]
+ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[[deps.Artifacts]]
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+
+[[deps.CompilerSupportLibraries_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "1.0.5+0"
+
+[[deps.Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[deps.LinearAlgebra]]
+deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[deps.OffsetArrays]]
+deps = ["Adapt"]
+git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
+uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+version = "1.12.10"
+
+[[deps.OpenBLAS_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
+uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
+version = "0.3.21+4"
+
+[[deps.Random]]
+deps = ["SHA", "Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[deps.Requires]]
+deps = ["UUIDs"]
+git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
+uuid = "ae029012-a4dd-5104-9daa-d747884805df"
+version = "1.3.0"
+
+[[deps.SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+version = "0.7.0"
+
+[[deps.Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[deps.UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[deps.libblastrampoline_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
+version = "5.8.0+0"
--- /dev/null
+[deps]
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+++ /dev/null
-# Following work by @jiahao, we compute character widths using a combination of
-# * character category
-# * UAX 11: East Asian Width
-# * a few exceptions as needed
-# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
-#
-# We used to also use data from GNU Unifont, but that has proven unreliable
-# and unlikely to match widths assumed by terminals.
-#
-# Requires Julia (obviously) and FontForge.
-
-#############################################################################
-CharWidths = Dict{Int,Int}()
-
-#############################################################################
-# Use ../libutf8proc for category codes, rather than the one in Julia,
-# to minimize bootstrapping complexity when a new version of Unicode comes out.
-catcode(c) = ccall((:utf8proc_category,"../libutf8proc"), Cint, (Int32,), c)
-
-# utf8proc category constants (must match h)
-const UTF8PROC_CATEGORY_CN = 0
-const UTF8PROC_CATEGORY_LU = 1
-const UTF8PROC_CATEGORY_LL = 2
-const UTF8PROC_CATEGORY_LT = 3
-const UTF8PROC_CATEGORY_LM = 4
-const UTF8PROC_CATEGORY_LO = 5
-const UTF8PROC_CATEGORY_MN = 6
-const UTF8PROC_CATEGORY_MC = 7
-const UTF8PROC_CATEGORY_ME = 8
-const UTF8PROC_CATEGORY_ND = 9
-const UTF8PROC_CATEGORY_NL = 10
-const UTF8PROC_CATEGORY_NO = 11
-const UTF8PROC_CATEGORY_PC = 12
-const UTF8PROC_CATEGORY_PD = 13
-const UTF8PROC_CATEGORY_PS = 14
-const UTF8PROC_CATEGORY_PE = 15
-const UTF8PROC_CATEGORY_PI = 16
-const UTF8PROC_CATEGORY_PF = 17
-const UTF8PROC_CATEGORY_PO = 18
-const UTF8PROC_CATEGORY_SM = 19
-const UTF8PROC_CATEGORY_SC = 20
-const UTF8PROC_CATEGORY_SK = 21
-const UTF8PROC_CATEGORY_SO = 22
-const UTF8PROC_CATEGORY_ZS = 23
-const UTF8PROC_CATEGORY_ZL = 24
-const UTF8PROC_CATEGORY_ZP = 25
-const UTF8PROC_CATEGORY_CC = 26
-const UTF8PROC_CATEGORY_CF = 27
-const UTF8PROC_CATEGORY_CS = 28
-const UTF8PROC_CATEGORY_CO = 29
-
-#############################################################################
-# Use a default width of 1 for all character categories that are
-# letter/symbol/number-like, as well as for unassigned/private-use chars.
-# This can be overridden by UAX 11
-# below, but provides a useful nonzero fallback for new codepoints when
-# a new Unicode version has been released but Unifont hasn't been updated yet.
-
-zerowidth = Set{Int}() # categories that may contain zero-width chars
-push!(zerowidth, UTF8PROC_CATEGORY_MN)
-push!(zerowidth, UTF8PROC_CATEGORY_MC)
-push!(zerowidth, UTF8PROC_CATEGORY_ME)
-# push!(zerowidth, UTF8PROC_CATEGORY_SK) # see issue #167
-push!(zerowidth, UTF8PROC_CATEGORY_ZL)
-push!(zerowidth, UTF8PROC_CATEGORY_ZP)
-push!(zerowidth, UTF8PROC_CATEGORY_CC)
-push!(zerowidth, UTF8PROC_CATEGORY_CF)
-push!(zerowidth, UTF8PROC_CATEGORY_CS)
-for c in 0x0000:0x110000
- if catcode(c) ∉ zerowidth
- CharWidths[c] = 1
- end
-end
-
-#############################################################################
-# Widths from UAX #11: East Asian Width
-# .. these take precedence for all codepoints
-# listed explicitly as wide/full/narrow/half-width
-
-for line in readlines(open("EastAsianWidth.txt"))
- #Strip comments
- (isempty(line) || line[1] == '#') && continue
- precomment = split(line, '#')[1]
- #Parse code point range and width code
- tokens = split(precomment, ';')
- length(tokens) >= 2 || continue
- charrange = tokens[1]
- width = strip(tokens[2])
- #Parse code point range into Julia UnitRange
- rangetokens = split(charrange, "..")
- charstart = parse(UInt32, "0x"*rangetokens[1])
- charend = parse(UInt32, "0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])
-
- #Assign widths
- for c in charstart:charend
- if width=="W" || width=="F" # wide or full
- CharWidths[c]=2
- elseif width=="Na"|| width=="H"
- CharWidths[c]=1
- end
- end
-end
-
-#############################################################################
-# A few exceptions to the above cases, found by manual comparison
-# to other wcwidth functions and similar checks.
-
-for c in keys(CharWidths)
- cat = catcode(c)
-
- # make sure format control character (category Cf) have width 0
- # (some of these, like U+0601, can have a width in some cases
- # but normally act like prepended combining marks. U+fff9 etc
- # are also odd, but have zero width in typical terminal contexts)
- if cat==UTF8PROC_CATEGORY_CF
- CharWidths[c]=0
- end
-
- # Unifont has nonzero width for a number of non-spacing combining
- # characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
- # the variation selectors
- if cat==UTF8PROC_CATEGORY_MN
- CharWidths[c]=0
- end
-
- # We also assign width of one to unassigned and private-use
- # codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
- # but since these are nonstandard it seems questionable to use Unifont metrics;
- # if they are printed as the replacement character U+FFFD they will have width 1).
- if cat==UTF8PROC_CATEGORY_CO || cat==UTF8PROC_CATEGORY_CN
- CharWidths[c]=1
- end
-
- # for some reason, Unifont has width-2 glyphs for ASCII control chars
- if cat==UTF8PROC_CATEGORY_CC
- CharWidths[c]=0
- end
-end
-
-#Soft hyphen is typically printed as a hyphen (-) in terminals.
-CharWidths[0x00ad]=1
-
-#By definition, should have zero width (on the same line)
-#0x002028 '
' category: Zl name: LINE SEPARATOR/
-#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
-CharWidths[0x2028]=0
-CharWidths[0x2029]=0
-
-#############################################################################
-# Output (to a file or pipe) for processing by data_generator.rb,
-# encoded as a sequence of intervals.
-
-firstc = 0x000000
-lastv = 0
-uhex(c) = uppercase(string(c,base=16,pad=4))
-for c in 0x0000:0x110000
- global firstc, lastv
- v = get(CharWidths, c, 0)
- if v != lastv || c == 0x110000
- v < 4 || error("invalid charwidth $v for $c")
- if firstc+1 < c
- println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
- else
- println(uhex(firstc), "; ", lastv)
- end
- firstc = c
- lastv = v
- end
-end
--- /dev/null
+using OffsetArrays: Origin
+
+parsehex(str) = parse(UInt32, str, base=16)
+
+function parse_hex_range(line)
+ m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line)
+ if isnothing(m)
+ return nothing
+ end
+ i = parsehex(m[1])
+ j = !isnothing(m[3]) ? parsehex(m[3]) : i
+ desc = rstrip(m[4])
+ return (i:j, desc)
+end
+
+function read_hex_ranges(filename)
+ [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)]
+end
+
+function collect_codepoints(range_desc, description)
+ list = UInt32[]
+ for (r,d) in range_desc
+ if d == description
+ append!(list, r)
+ end
+ end
+ list
+end
+
+function set_all!(d, keys, value)
+ for k in keys
+ d[k] = value
+ end
+end
+
+#-------------------------------------------------------------------------------
+
+derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt")
+
+ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point"))
+uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase"))
+lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase"))
+
+
+#-------------------------------------------------------------------------------
+function derive_indic_conjunct_break(derived_core_properties)
+ props = Dict{UInt32, String}()
+ set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"), "LINKER")
+ set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT")
+ set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"), "EXTEND")
+ props
+end
+
+let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties)
+ global function get_indic_conjunct_break(code)
+ get(indic_conjunct_break, code, "NONE")
+ end
+end
+
+#-------------------------------------------------------------------------------
+function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)
+ grapheme_boundclass = Dict{UInt32, String}()
+ for (r,desc) in read_hex_ranges(grapheme_break_filename)
+ set_all!(grapheme_boundclass, r, Base.uppercase(desc))
+ end
+ for (r,desc) in read_hex_ranges(emoji_data_filename)
+ if desc == "Extended_Pictographic"
+ set_all!(grapheme_boundclass, r, "EXTENDED_PICTOGRAPHIC")
+ elseif desc == "Emoji_Modifier"
+ set_all!(grapheme_boundclass, r, "EXTEND")
+ end
+ end
+ return grapheme_boundclass
+end
+
+let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt")
+ global function get_grapheme_boundclass(code)
+ get(grapheme_boundclasses, code, "OTHER")
+ end
+end
+
+#-------------------------------------------------------------------------------
+function read_composition_exclusions(pattern)
+ section = match(pattern, read("CompositionExclusions.txt",String)).match
+ es = UInt32[]
+ for line in split(section, '\n')
+ m = match(r"^([0-9A-F]+) +#"i, line)
+ if !isnothing(m)
+ push!(es, parsehex(m[1]))
+ end
+ end
+ es
+end
+
+exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s))
+excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s))
+
+# FIXME: Replicate a bug in the ruby code
+push!(exclusions, 0)
+push!(excl_version, 0)
+
+#-------------------------------------------------------------------------------
+function read_case_folding(filename)
+ case_folding = Dict{UInt32,Vector{UInt32}}()
+ for line in readlines(filename)
+ m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line)
+ !isnothing(m) || continue
+ case_folding[parsehex(m[1])] = parsehex.(split(m[2]))
+ end
+ case_folding
+end
+
+let case_folding = read_case_folding("CaseFolding.txt")
+ global function get_case_folding(code)
+ get(case_folding, code, nothing)
+ end
+end
+
+#-------------------------------------------------------------------------------
+# Utilities for reading per-char properties from UnicodeData.txt
+function split_unicode_data_line(line)
+ m = match(r"""
+ ([0-9A-F]+); # code
+ ([^;]+); # name
+ ([A-Z]+); # general category
+ ([0-9]+); # canonical combining class
+ ([A-Z]+); # bidi class
+ (<([A-Z]*)>)? # decomposition type
+                    ((\ ?[0-9A-F]+)*);  # decomposition mapping
+ ([0-9]*); # decimal digit
+ ([0-9]*); # digit
+ ([^;]*); # numeric
+ ([YN]*); # bidi mirrored
+ ([^;]*); # unicode 1.0 name
+ ([^;]*); # iso comment
+ ([0-9A-F]*); # simple uppercase mapping
+ ([0-9A-F]*); # simple lowercase mapping
+ ([0-9A-F]*)$ # simple titlecase mapping
+ """ix, line)
+ @assert !isnothing(m)
+ code = parse(UInt32, m[1], base=16)
+ (code = code,
+ name = m[2],
+ category = m[3],
+ combining_class = parse(Int, m[4]),
+ bidi_class = m[5],
+ decomp_type = m[7],
+ decomp_mapping = m[8] == "" ? nothing : parsehex.(split(m[8])),
+ bidi_mirrored = m[13] == "Y",
+ # issue #130: use nonstandard uppercase ß -> ẞ
+ # issue #195: if character is uppercase but has no lowercase mapping,
+ # then make lowercase mapping = itself (vice versa for lowercase)
+ uppercase_mapping = m[16] != "" ? parsehex(m[16]) :
+ code == 0x000000df ? 0x00001e9e :
+ m[17] == "" && code in lowercase ? code :
+ nothing,
+ lowercase_mapping = m[17] != "" ? parsehex(m[17]) :
+ m[16] == "" && code in uppercase ? code :
+ nothing,
+ titlecase_mapping = m[18] != "" ? parsehex(m[18]) :
+ code == 0x000000df ? 0x00001e9e :
+ nothing,
+ )
+end
+
+function read_unicode_data(filename)
+ raw_char_props = split_unicode_data_line.(readlines(filename))
+ char_props = Origin(0)(Vector{eltype(raw_char_props)}())
+ @assert issorted(raw_char_props, by=c->c.code)
+ raw_char_props = Iterators.Stateful(raw_char_props)
+ while !isempty(raw_char_props)
+ c = popfirst!(raw_char_props)
+ if occursin(", First>", c.name)
+ nc = popfirst!(raw_char_props)
+ @assert occursin(", Last>", nc.name)
+ name = replace(c.name, ", First"=>"")
+ for i in c.code:nc.code
+ push!(char_props, (; c..., name=name, code=i))
+ end
+ else
+ push!(char_props, c)
+ end
+ end
+ return char_props
+end
+
+char_props = read_unicode_data("UnicodeData.txt")
+char_hash = Dict(c.code=>c for c in char_props)
+
+#-------------------------------------------------------------------------------
+# Read character widths from UAX #11: East Asian Width
+function read_east_asian_widths(filename)
+ ea_widths = Dict{UInt32,Int}()
+ for (rng,widthcode) in read_hex_ranges(filename)
+ w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
+ widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
+ nothing
+ if !isnothing(w)
+ set_all!(ea_widths, rng, w)
+ end
+ end
+ return ea_widths
+end
+
+let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
+ # Following work by @jiahao, we compute character widths using a combination of
+ # * character category
+ # * UAX 11: East Asian Width
+ # * a few exceptions as needed
+ # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
+ global function derive_char_width(code, category)
+ # Use a default width of 1 for all character categories that are
+ # letter/symbol/number-like, as well as for unassigned/private-use chars.
+ # This provides a useful nonzero fallback for new codepoints when a new
+ # Unicode version has been released.
+ width = 1
+
+ # Various zero-width categories
+ #
+ # "Sk" not included in zero width - see issue #167
+ if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
+ width = 0
+ end
+
+ # Widths from UAX #11: East Asian Width
+ eaw = get(ea_widths, code, nothing)
+ if !isnothing(eaw)
+ width = eaw
+ end
+
+ # A few exceptional cases, found by manual comparison to other wcwidth
+ # functions and similar checks.
+ if category == "Mn"
+ width = 0
+ end
+
+ if code == 0x00ad
+ # Soft hyphen is typically printed as a hyphen (-) in terminals.
+ width = 1
+ elseif code == 0x2028 || code == 0x2029
+ #By definition, should have zero width (on the same line)
+ #0x002028 '
' category: Zl name: LINE SEPARATOR/
+ #0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
+ width = 0
+ end
+
+ return width
+ end
+end
+
+#-------------------------------------------------------------------------------
+# Construct data tables which will drive libutf8proc
+#
+# These tables are "compressed" with an ad-hoc compression scheme (largely some
+# simple deduplication and indexing) which can easily and efficiently be
+# decompressed on the C side at runtime.
+
+# Inverse decomposition mapping tables for combining two characters into a single one.
+comb1st_indices = Dict{UInt32,Int}()
+comb1st_indices_sorted_keys = Origin(0)(UInt32[])
+comb2nd_indices = Dict{UInt32,Int}()
+comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
+comb2nd_indices_nonbasic = Set{UInt32}()
+comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
+for char in char_props
+ if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
+ length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
+ char_hash[char.decomp_mapping[1]].combining_class == 0 &&
+ char.code ∉ exclusions
+ dm0 = char.decomp_mapping[1]
+ dm1 = char.decomp_mapping[2]
+ if !haskey(comb1st_indices, dm0)
+ comb1st_indices[dm0] = length(comb1st_indices)
+ push!(comb1st_indices_sorted_keys, dm0)
+ push!(comb_array, Dict{Int,UInt32}())
+ @assert length(comb1st_indices) == length(comb_array)
+ end
+ if !haskey(comb2nd_indices, dm1)
+ push!(comb2nd_indices_sorted_keys, dm1)
+ comb2nd_indices[dm1] = length(comb2nd_indices)
+ end
+ @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
+ comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
+ if char.code > 0xFFFF
+ push!(comb2nd_indices_nonbasic, dm1)
+ end
+ end
+end
+
+comb_indices = Dict{UInt32,Int}()
+comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
+comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
+let
+ cumoffset = 0
+ for dm0 in comb1st_indices_sorted_keys
+ index = comb1st_indices[dm0]
+ first = nothing
+ last = nothing
+ offset = 0
+ for b in eachindex(comb2nd_indices_sorted_keys)
+ dm1 = comb2nd_indices_sorted_keys[b]
+ if haskey(comb_array[index], b)
+ if isnothing(first)
+ first = offset
+ end
+ last = offset
+ if dm1 in comb2nd_indices_nonbasic
+ last += 1
+ end
+ end
+ offset += 1
+ if dm1 in comb2nd_indices_nonbasic
+ offset += 1
+ end
+ end
+ comb1st_indices_firstoffsets[index] = first
+ comb1st_indices_lastoffsets[index] = last
+ @assert !haskey(comb_indices, dm0)
+ comb_indices[dm0] = cumoffset
+ cumoffset += last - first + 1 + 2
+ end
+
+ offset = 0
+ for dm1 in comb2nd_indices_sorted_keys
+ @assert !haskey(comb_indices, dm1)
+ comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
+ @assert comb2nd_indices[dm1] + offset <= 0x4000
+ if dm1 in comb2nd_indices_nonbasic
+ comb_indices[dm1] |= 0x4000
+ offset += 1
+ end
+ end
+end
+
+utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq))
+
+# Utility for packing all UTF-16 encoded sequences into one big array
+struct UTF16Sequences
+ storage::Vector{UInt16}
+ indices::Dict{Vector{UInt16},Int}
+end
+UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}())
+
+"""
+Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where
+* The 14 low bits are the index into the `sequences.storage` array where the
+ sequence resides
+* The two top bits are the length of the sequence, or if equal to 3, the first
+ entry of the sequence itself contains the length.
+"""
+function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector)
+ if length(utf32_seq) == 0
+ return typemax(UInt16)
+ end
+ # lencode contains the length of the UTF-32 sequence after decoding
+ # No sequence has len 0, so we encode len 1 as 0, len 2 as 1.
+ # We have only 2 bits for the length, though, so longer sequences are
+ # encoded in the sequence data itself.
+ seq_lencode = length(utf32_seq) - 1
+ utf16_seq = utf16_encode(utf32_seq)
+ idx = get!(sequences.indices, utf16_seq) do
+ i = length(sequences.storage)
+ utf16_seq_enc = seq_lencode < 3 ? utf16_seq :
+ pushfirst!(copy(utf16_seq), seq_lencode)
+ append!(sequences.storage, utf16_seq_enc)
+ i
+ end
+ @assert idx <= 0x3FFF
+ seq_code = idx | (min(seq_lencode, 3) << 14)
+ return seq_code
+end
+
+function encode_sequence!(sequences::UTF16Sequences, code::Integer)
+ encode_sequence!(sequences, [code])
+end
+
+function encode_sequence!(sequences::UTF16Sequences, ::Nothing)
+ return typemax(UInt16)
+end
+
+function char_table_properties!(sequences, char)
+ code = char.code
+
+ return (
+ category = char.category,
+ combining_class = char.combining_class,
+ bidi_class = char.bidi_class,
+ decomp_type = char.decomp_type,
+ decomp_seqindex = encode_sequence!(sequences, char.decomp_mapping),
+ casefold_seqindex = encode_sequence!(sequences, get_case_folding(code)),
+ uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
+ lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
+ titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
+ comb_index = get(comb_indices, code, typemax(UInt16)),
+ bidi_mirrored = char.bidi_mirrored,
+ comp_exclusion = code in exclusions || code in excl_version,
+ ignorable = code in ignorable,
+ control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
+ # FIXME: Ruby bug compat - should be `code in (0x200C, 0x200D)`
+ !(char.category in (0x200C, 0x200D)),
+ charwidth = derive_char_width(code, char.category),
+ boundclass = get_grapheme_boundclass(code),
+ indic_conjunct_break = get_indic_conjunct_break(code),
+ )
+end
+
+# Many character properties are duplicates. Deduplicate them, constructing a
+# per-character array of indices into the properties array
+sequences = UTF16Sequences()
+
+# FIXME: Hack to force ordering compat with Ruby code
+for c in char_props
+ encode_sequence!(sequences, c.decomp_mapping)
+ encode_sequence!(sequences, get_case_folding(c.code))
+end
+
+char_table_props = [char_table_properties!(sequences, cp) for cp in char_props]
+
+deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}())
+char_property_indices = Origin(0)(zeros(Int, 0x00110000))
+let index_map = Dict{eltype(char_table_props),Int}()
+ for (char, table_props) in zip(char_props, char_table_props)
+ entry_idx = get!(index_map, table_props) do
+ idx = length(deduplicated_props)
+ push!(deduplicated_props, table_props)
+ idx
+ end
+ # Add 1 because unassigned codes occupy slot at index 0
+ char_property_indices[char.code] = entry_idx + 1
+ end
+end
+
+# Now compress char_property_indices by breaking it into pages and
+# deduplicating those (this works as compression because there are large
+# contiguous ranges of code space with identical properties)
+prop_page_indices = Int[]
+prop_pages = Int[]
+let
+ page_size = 0x100
+ page_index_map = Dict{Vector{Int}, Int}()
+ for page in Iterators.partition(char_property_indices, page_size)
+ page_idx = get!(page_index_map, page) do
+ idx = length(prop_pages)
+ append!(prop_pages, page)
+ idx
+ end
+ push!(prop_page_indices, page_idx)
+ end
+end
+
+#-------------------------------------------------------------------------------
+function write_c_index_array(io, array, linelen)
+ print(io, "{\n ")
+ i = 0
+ for x in array
+ i += 1
+ if i == linelen
+ i = 0
+ print(io, "\n ")
+ end
+ print(io, x, ", ")
+ end
+ print(io, "};\n\n")
+end
+
+function c_enum_name(prefix, str)
+ if isnothing(str)
+ return "0"
+ else
+ return "UTF8PROC_$(prefix)_$(Base.uppercase(str))"
+ end
+end
+
+function c_uint16(seqindex)
+ if seqindex == typemax(UInt16)
+ return "UINT16_MAX"
+ else
+ return string(seqindex)
+ end
+end
+
+function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
+ comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
+ comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
+ print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
+ write_c_index_array(io, sequences.storage, 8)
+ print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
+ write_c_index_array(io, prop_page_indices, 8)
+ print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ")
+ write_c_index_array(io, prop_pages, 8)
+
+ print(io, """
+ static const utf8proc_property_t utf8proc_properties[] = {
+ {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
+ """)
+ for prop in deduplicated_props
+ print(io, " {",
+ c_enum_name("CATEGORY", prop.category), ", ",
+ prop.combining_class, ", ",
+ c_enum_name("BIDI_CLASS", prop.bidi_class), ", ",
+ c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ",
+ c_uint16(prop.decomp_seqindex), ", ",
+ c_uint16(prop.casefold_seqindex), ", ",
+ c_uint16(prop.uppercase_seqindex), ", ",
+ c_uint16(prop.lowercase_seqindex), ", ",
+ c_uint16(prop.titlecase_seqindex), ", ",
+ c_uint16(prop.comb_index), ", ",
+ prop.bidi_mirrored, ", ",
+ prop.comp_exclusion, ", ",
+ prop.ignorable, ", ",
+ prop.control_boundary, ", ",
+ prop.charwidth, ", ",
+ "0, ", # bitfield padding
+ c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
+ c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
+ "},\n"
+ )
+ end
+ print(io, "};\n\n")
+
+ print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ")
+ i = 0
+ for a in eachindex(comb1st_indices_firstoffsets)
+ offset = 0
+ print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
+ for b in eachindex(comb2nd_indices_sorted_keys)
+ dm1 = comb2nd_indices_sorted_keys[b]
+ if offset > comb1st_indices_lastoffsets[a]
+ break
+ end
+ if offset >= comb1st_indices_firstoffsets[a]
+ i += 1
+ if i == 8
+ i = 0
+ print(io, "\n ")
+ end
+ v = get(comb_array[a], b, 0)
+ if dm1 in comb2nd_indices_nonbasic
+ print(io, (v & 0xFFFF0000) >> 16, ", ")
+ end
+ print(io, v & 0xFFFF, ", ")
+ end
+ offset += 1
+ if dm1 in comb2nd_indices_nonbasic
+ offset += 1
+ end
+ end
+ print(io, "\n")
+ end
+ print(io, "};\n\n")
+end
+
+
+if !isinteractive()
+ print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
+ comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
+ comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
+end
+
+++ /dev/null
-#!/usr/bin/env ruby
-
-# This file was used to generate the 'unicode_data.c' file by parsing the
-# Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
-# It is included for informational purposes only and not intended for
-# production use.
-
-
-# Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer,
-# Benito van der Zander, Michaël Meyer, and other contributors.
-# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-# This file contains derived data from a modified version of the
-# Unicode data files. The following license applies to that data:
-#
-# COPYRIGHT AND PERMISSION NOTICE
-#
-# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
-# under the Terms of Use in http://www.unicode.org/copyright.html.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of the Unicode data files and any associated documentation (the "Data
-# Files") or Unicode software and any associated documentation (the
-# "Software") to deal in the Data Files or Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, and/or sell copies of the Data Files or Software, and
-# to permit persons to whom the Data Files or Software are furnished to do
-# so, provided that (a) the above copyright notice(s) and this permission
-# notice appear with all copies of the Data Files or Software, (b) both the
-# above copyright notice(s) and this permission notice appear in associated
-# documentation, and (c) there is clear notice in each modified Data File or
-# in the Software as well as in the documentation associated with the Data
-# File(s) or Software that the data or software has been modified.
-#
-# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
-# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
-# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
-# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
-# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
-# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-# PERFORMANCE OF THE DATA FILES OR SOFTWARE.
-#
-# Except as contained in this notice, the name of a copyright holder shall
-# not be used in advertising or otherwise to promote the sale, use or other
-# dealings in these Data Files or Software without prior written
-# authorization of the copyright holder.
-
-
-$ignorable_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
-$ignorable = []
-$ignorable_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $ignorable << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $ignorable << $&.hex
- end
-end
-
-$uppercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Uppercase.*?# Total code points:/m]
-$uppercase = []
-$uppercase_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $uppercase << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $uppercase << $&.hex
- end
-end
-
-$lowercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Lowercase.*?# Total code points:/m]
-$lowercase = []
-$lowercase_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $lowercase << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $lowercase << $&.hex
- end
-end
-
-$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m]
-$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
-$icb_linker_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
- elsif entry =~ /^[0-9A-F]+/
- $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"
- end
-end
-$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m]
-$icb_consonant_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
- elsif entry =~ /^[0-9A-F]+/
- $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"
- end
-end
-$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m]
-$icb_extend_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
- elsif entry =~ /^[0-9A-F]+/
- $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"
- end
-end
-
-$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
-$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
-$grapheme_boundclass_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
- end
-end
-
-$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8')
-$emoji_data_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
- elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
- end
-end
-
-$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8')
-$charwidth = Hash.new(0)
-$charwidth_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
- $1.hex.upto($2.hex) { |e2| $charwidth[e2] = $3.to_i }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*([0-9]+)/
- $charwidth[$1.hex] = $2.to_i
- end
-end
-
-$exclusions = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(1\) Script Specifics.*?# Total code points:/m]
-$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }
-
-$excl_version = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
-$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
-
-$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8')
-$case_folding = {}
-$case_folding_string.chomp.split("\n").each do |line|
- next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
- $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
-end
-
-$int_array = []
-$int_array_indicies = {}
-
-def str2c(string, prefix)
- return "0" if string.nil?
- return "UTF8PROC_#{prefix}_#{string.upcase}"
-end
-def pushary(array)
- idx = $int_array_indicies[array]
- unless idx
- $int_array_indicies[array] = $int_array.length
- idx = $int_array.length
- array.each { |entry| $int_array << entry }
- end
- return idx
-end
-def cpary2utf16encoded(array)
- return array.flat_map { |cp|
- if (cp <= 0xFFFF)
- raise "utf-16 code: #{cp}" if cp & 0b1111100000000000 == 0b1101100000000000
- cp
- else
- temp = cp - 0x10000
- [(temp >> 10) | 0b1101100000000000, (temp & 0b0000001111111111) | 0b1101110000000000]
- end
- }
-end
-def cpary2c(array)
- return "UINT16_MAX" if array.nil? || array.length == 0
- lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
- array = cpary2utf16encoded(array)
- if lencode >= 3 #we have only 2 bits for the length
- array = [lencode] + array
- lencode = 3
- end
- idx = pushary(array)
- raise "Array index out of bound" if idx > 0x3FFF
- return "#{idx | (lencode << 14)}"
-end
-def singlecpmap(cp)
- return "UINT16_MAX" if cp == nil
- idx = pushary(cpary2utf16encoded([cp]))
- raise "Array index out of bound" if idx > 0xFFFF
- return "#{idx}"
-end
-
-class UnicodeChar
- attr_accessor :code, :name, :category, :combining_class, :bidi_class,
- :decomp_type, :decomp_mapping,
- :bidi_mirrored,
- :uppercase_mapping, :lowercase_mapping, :titlecase_mapping,
- #caches:
- :c_entry_index, :c_decomp_mapping, :c_case_folding
- def initialize(line)
- raise "Could not parse input." unless line =~ /^
- ([0-9A-F]+); # code
- ([^;]+); # name
- ([A-Z]+); # general category
- ([0-9]+); # canonical combining class
- ([A-Z]+); # bidi class
- (<([A-Z]*)>)? # decomposition type
- ((\ ?[0-9A-F]+)*); # decompomposition mapping
- ([0-9]*); # decimal digit
- ([0-9]*); # digit
- ([^;]*); # numeric
- ([YN]*); # bidi mirrored
- ([^;]*); # unicode 1.0 name
- ([^;]*); # iso comment
- ([0-9A-F]*); # simple uppercase mapping
- ([0-9A-F]*); # simple lowercase mapping
- ([0-9A-F]*)$/ix # simple titlecase mapping
- @code = $1.hex
- @name = $2
- @category = $3
- @combining_class = Integer($4)
- @bidi_class = $5
- @decomp_type = $7
- @decomp_mapping = ($8=='') ? nil :
- $8.split.collect { |element| element.hex }
- @bidi_mirrored = ($13=='Y') ? true : false
- # issue #130: use nonstandard uppercase ß -> ẞ
- # issue #195: if character is uppercase but has no lowercase mapping,
- # then make lowercase mapping = itself (vice versa for lowercase)
- @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex
- @lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex
- @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
- end
- def case_folding
- $case_folding[code]
- end
- def c_entry(comb_indicies)
- " " <<
- "{#{str2c category, 'CATEGORY'}, #{combining_class}, " <<
- "#{str2c bidi_class, 'BIDI_CLASS'}, " <<
- "#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
- "#{c_decomp_mapping}, " <<
- "#{c_case_folding}, " <<
- "#{singlecpmap uppercase_mapping }, " <<
- "#{singlecpmap lowercase_mapping }, " <<
- "#{singlecpmap titlecase_mapping }, " <<
- "#{comb_indicies[code] ? comb_indicies[code]: 'UINT16_MAX'}, " <<
- "#{bidi_mirrored}, " <<
- "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
- "#{$ignorable.include?(code)}, " <<
- "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
- "#{$charwidth[code]}, 0, " <<
- "#{$grapheme_boundclass[code]}, " <<
- "#{$icb[code]}},\n"
- end
-end
-
-chars = []
-char_hash = {}
-
-while gets
- if $_ =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
- first = $1.hex
- gets
- char = UnicodeChar.new($_)
- raise "No last character of sequence found." unless
- $_ =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
- last = $1.hex
- name = "<#{$2}>"
- for i in first..last
- char_clone = char.clone
- char_clone.code = i
- char_clone.name = name
- char_hash[char_clone.code] = char_clone
- chars << char_clone
- end
- else
- char = UnicodeChar.new($_)
- char_hash[char.code] = char
- chars << char
- end
-end
-
-comb1st_indicies = {}
-comb2nd_indicies = {}
-comb2nd_indicies_sorted_keys = []
-comb2nd_indicies_nonbasic = {}
-comb_array = []
-
-chars.each do |char|
- if !char.nil? and char.decomp_type.nil? and char.decomp_mapping and
- char.decomp_mapping.length == 2 and !char_hash[char.decomp_mapping[0]].nil? and
- char_hash[char.decomp_mapping[0]].combining_class == 0 and
- not $exclusions.include?(char.code)
-
- dm0 = char.decomp_mapping[0]
- dm1 = char.decomp_mapping[1]
- unless comb1st_indicies[dm0]
- comb1st_indicies[dm0] = comb1st_indicies.keys.length
- end
- unless comb2nd_indicies[dm1]
- comb2nd_indicies_sorted_keys << dm1
- comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
- end
- comb_array[comb1st_indicies[dm0]] ||= []
- raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
- comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
-
- comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
- end
- char.c_decomp_mapping = cpary2c(char.decomp_mapping)
- char.c_case_folding = cpary2c(char.case_folding)
-end
-
-comb_indicies = {}
-cumoffset = 0
-comb1st_indicies_lastoffsets = []
-comb1st_indicies_firstoffsets = []
-comb1st_indicies.each do |dm0, index|
- first = nil
- last = nil
- offset = 0
- comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
- if comb_array[index][b]
- first = offset unless first
- last = offset
- last += 1 if comb2nd_indicies_nonbasic[dm1]
- end
- offset += 1
- offset += 1 if comb2nd_indicies_nonbasic[dm1]
- end
- comb1st_indicies_firstoffsets[index] = first
- comb1st_indicies_lastoffsets[index] = last
- raise "double index" if comb_indicies[dm0]
- comb_indicies[dm0] = cumoffset
- cumoffset += last - first + 1 + 2
-end
-
-offset = 0
-comb2nd_indicies_sorted_keys.each do |dm1|
- raise "double index" if comb_indicies[dm1]
- comb_indicies[dm1] = 0x8000 | (comb2nd_indicies[dm1] + offset)
- raise "too large comb index" if comb2nd_indicies[dm1] + offset > 0x4000
- if comb2nd_indicies_nonbasic[dm1]
- comb_indicies[dm1] = comb_indicies[dm1] | 0x4000
- offset += 1
- end
-end
-
-properties_indicies = {}
-properties = []
-chars.each do |char|
- c_entry = char.c_entry(comb_indicies)
- char.c_entry_index = properties_indicies[c_entry]
- unless char.c_entry_index
- properties_indicies[c_entry] = properties.length
- char.c_entry_index = properties.length
- properties << c_entry
- end
-end
-
-stage1 = []
-stage2 = []
-for code in 0...0x110000
- next unless code % 0x100 == 0
- stage2_entry = []
- for code2 in code...(code+0x100)
- if char_hash[code2]
- stage2_entry << (char_hash[code2].c_entry_index + 1)
- else
- stage2_entry << 0
- end
- end
- old_index = stage2.index(stage2_entry)
- if old_index
- stage1 << (old_index * 0x100)
- else
- stage1 << (stage2.length * 0x100)
- stage2 << stage2_entry
- end
-end
-
-$stdout << "static const utf8proc_uint16_t utf8proc_sequences[] = {\n "
-i = 0
-$int_array.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "static const utf8proc_uint16_t utf8proc_stage1table[] = {\n "
-i = 0
-stage1.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "static const utf8proc_uint16_t utf8proc_stage2table[] = {\n "
-i = 0
-stage2.flatten.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
-properties.each { |line|
- $stdout << line
-}
-$stdout << "};\n\n"
-
-
-
-$stdout << "static const utf8proc_uint16_t utf8proc_combinations[] = {\n "
-i = 0
-comb1st_indicies.keys.each_index do |a|
- offset = 0
- $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
- comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
- break if offset > comb1st_indicies_lastoffsets[a]
- if offset >= comb1st_indicies_firstoffsets[a]
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- v = comb_array[a][b] ? comb_array[a][b] : 0
- $stdout << (( v & 0xFFFF0000 ) >> 16) << ", " if comb2nd_indicies_nonbasic[dm1]
- $stdout << (v & 0xFFFF) << ", "
- end
- offset += 1
- offset += 1 if comb2nd_indicies_nonbasic[dm1]
- end
- $stdout << "\n"
-end
-$stdout << "};\n\n"